# import pandas as pdimport torchimport torch.nn as nnimport torch.optim as optimimport polars as plimport numpy as npfrom lets_plot import*from sklearn.model_selection import train_test_splitfrom sklearn.preprocessing import StandardScalerfrom sklearn.ensemble import GradientBoostingClassifierfrom sklearn.inspection import permutation_importancefrom sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixinfrom sklearn.metrics import ( classification_report, accuracy_score, recall_score, precision_score, f1_score, r2_score, mean_absolute_error, mean_squared_error)LetsPlot.setup_html(isolated_frame=True)
Show the code
# Data sources for the dwellings project.
# url1-url3 are the raw CSV files hosted on GitHub; url4 is the repository's
# `data.md` page. None of them is read in this cell - they are kept for
# reference only.
url1 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_ml/dwellings_ml.csv"
url2 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_neighborhoods_ml/dwellings_neighborhoods_ml.csv"
url3 = "https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_denver/dwellings_denver.csv"
url4 = "https://github.com/byuidatascience/data4dwellings/blob/master/data.md"

# The modelling data is loaded with polars from a local file in the working
# directory (presumably a downloaded copy of url1 - TODO confirm).
df = pl.read_csv('dwellings_ml.csv')
QUESTION
Build a classification model labeling houses as being built “before 1980” or “during or after 1980”. Your goal is to reach or exceed 90% accuracy. Report your final model choice and any other model parameters you may have tweaked (train-test split ratio, tuning parameters, etc).
The model chosen was scikit-learn's GradientBoostingClassifier, used with its default hyperparameters. The only settings changed were in the train-test split: a test_size of 0.2 and a random_state of 117.
Show the code
# Build the feature matrix and the target.
# `parcel`, `before1980` and `yrbuilt` are removed from the features;
# `before1980` is the label being predicted (presumably `yrbuilt` is dropped
# because it would reveal the label directly - confirm against the data
# dictionary).
X = df.drop(["parcel", "before1980", "yrbuilt"])
y = df.select(["before1980"])

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=117
)

# Gradient boosting classifier with default hyperparameters.
model_xgb = GradientBoostingClassifier()

# `y_train` is a single-column frame; scikit-learn expects a 1-D target and
# otherwise emits a column-vector DataConversionWarning, so flatten it.
model_xgb.fit(X_train, np.ravel(y_train))
GradientBoostingClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingClassifier()
Show the code
# Predict the held-out rows with the fitted gradient boosting model, then
# print scikit-learn's classification report for those predictions.
pred = model_xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
# Select the torch device used by every later cell: the GPU when CUDA is
# available, the CPU otherwise. The choice is announced on stdout.
if torch.cuda.is_available():
    print("CUDA is available, using the GPU")
    print(torch.cuda.get_device_name(0))  # GPU name
    device = torch.device("cuda")
else:
    print("CUDA is not available, using the CPU")
    device = torch.device("cpu")
CUDA is available, using the GPU
NVIDIA GeForce RTX 4060 Laptop GPU
Show the code
# Converting to numpy for PyTorch
X_raw = X.to_numpy()
y_nn = y.to_numpy()

# Split BEFORE scaling: fitting the scaler on the full data set would let
# statistics of the test rows leak into the training features.
X_raw_trn, X_raw_tst, y_nn_trn, y_nn_tst = train_test_split(
    X_raw, y_nn, test_size=0.2, random_state=117
)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler = StandardScaler().fit(X_raw_trn)
X_nn_trn = scaler.transform(X_raw_trn)
X_nn_tst = scaler.transform(X_raw_tst)
# Full feature matrix on the same scale as the training data; later cells
# read `X_nn` for the input size and for feature importance.
X_nn = scaler.transform(X_raw)

# Converting to PyTorch tensors. Targets are reshaped to a column so they
# line up with a network that emits one value per row.
X_trn_tensor = torch.tensor(X_nn_trn, dtype=torch.float32)
y_trn_tensor = torch.tensor(y_nn_trn, dtype=torch.float32).view(-1, 1)
X_tst_tensor = torch.tensor(X_nn_tst, dtype=torch.float32)
y_tst_tensor = torch.tensor(y_nn_tst, dtype=torch.float32).view(-1, 1)

if torch.cuda.is_available():
    print("Moving Tensors to the GPU")
    X_trn_tensor = X_trn_tensor.to(device)
    y_trn_tensor = y_trn_tensor.to(device)
    X_tst_tensor = X_tst_tensor.to(device)
    y_tst_tensor = y_tst_tensor.to(device)

# Multi-class note: with nn.CrossEntropyLoss() the *targets* must be class
# indices stored in a LongTensor (floats and strings are not accepted); the
# features stay float32.
#   y_trn_tensor = torch.tensor(y_nn_trn, dtype=torch.long).view(-1)
# String labels have to be mapped to indices first, e.g.:
#   class_names = ["cat", "dog", "bird"]
#   labels = ["dog", "cat", "bird", "cat"]
#   class_to_index = {name: idx for idx, name in enumerate(class_names)}
#   indices = [class_to_index[label] for label in labels]
#   y_trn_tensor = torch.tensor(indices, dtype=torch.long)
Moving Tensors to the GPU
Show the code
# Defining a simple neural network
class Net(nn.Module):
    """Two-layer fully connected network that emits one raw value per row.

    No activation is applied to the output, so it can be used as a logit for
    binary classification or as a direct prediction for regression.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()  # set up nn.Module's internal machinery
        self.fc1 = nn.Linear(input_size, 16)  # input -> 16 hidden units
        self.relu = nn.ReLU()  # non-linearity; replaces negative values with 0
        self.fc2 = nn.Linear(16, 1)  # 16 hidden units -> 1 output
        # For multi-class classification, add a `num_classes` constructor
        # parameter and use this as the final layer instead:
        # self.fc2 = nn.Linear(16, num_classes)

    def forward(self, x):
        """Pass `x` through fc1 -> ReLU -> fc2 and return the result."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


class TorchModelWrapper(BaseEstimator):
    """scikit-learn style adapter around an existing PyTorch model.

    It exposes `fit`, `predict` and `score` so the model can be handed to
    scikit-learn utilities that expect an estimator (used in this file for
    measuring feature importance).

    Args:
        model: The PyTorch module to wrap. It is used as-is; `fit` does not
            train it.
        device: Torch device the input is moved to before the forward pass.
        task: 'classification' (default) or 'regression'.
        scorer: Name of the metric returned by `score`. One of 'f1_score',
            'accuracy_score', 'MSE', 'MAE', 'r2_score', 'recall_score',
            'precision_score'.
    """

    def __init__(self, model, device, task="classification", scorer="f1_score"):
        self.model = model
        self.device = device
        self.task = task  # 'classification' or 'regression'
        self.scorer = scorer

    def fit(self, X, y):
        """Do nothing and return `self`; present only for API compatibility."""
        return self

    def predict(self, X):
        """Run the wrapped model on `X` and return a flat numpy array.

        The model is switched to eval mode and left in that mode.

        Returns:
            Regression: the flattened raw outputs.
            Classification: 0/1 labels (sigmoid thresholded at 0.5) when the
            model emits one value per row, otherwise the argmax over dim 1.

        Raises:
            ValueError: If `task` is neither 'regression' nor
                'classification'.
        """
        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
            outputs = self.model(X_tensor)

            if self.task == "regression":
                return outputs.cpu().numpy().flatten()

            if self.task == "classification":
                # A model may return shape (n,) as well as (n, 1); both mean
                # "one logit per row". Checking ndim first avoids an
                # IndexError on `shape[1]` for the 1-D case.
                if outputs.ndim < 2 or outputs.shape[1] == 1:
                    probs = torch.sigmoid(outputs).cpu().numpy()
                    return (probs >= 0.5).astype(int).flatten()
                return torch.argmax(outputs, dim=1).cpu().numpy()

            raise ValueError(f"Unsupported task type: {self.task}\nOnly two types of tasks are available: regression and classification (default)")

    def score(self, X, y):
        """Return the metric selected by `scorer` for predictions on `X`.

        NOTE(review): 'MSE' and 'MAE' return the raw error (lower is better),
        while scikit-learn utilities that call `score` generally treat higher
        as better - confirm before using those two with such utilities.

        Raises:
            ValueError: If `scorer` is not one of the supported names.
        """
        # Predict first so errors surface in the same order as before.
        y_pred = self.predict(X)

        # Built per call so the metric names are only resolved when used.
        scorers = {
            "f1_score": f1_score,
            "accuracy_score": accuracy_score,
            "MSE": mean_squared_error,
            "MAE": mean_absolute_error,
            "r2_score": r2_score,
            "recall_score": recall_score,
            "precision_score": precision_score,
        }
        try:
            metric = scorers[self.scorer]
        except (KeyError, TypeError):
            # TypeError covers an unhashable `scorer` value.
            raise ValueError(f"Unsupported scoring method: {self.scorer}") from None
        return metric(y, y_pred)
Show the code
# Build the network with one input per feature column of the scaled matrix.
model_nn = Net(X_nn.shape[1])

# Put the model on the GPU when one is present (the tensors are moved there
# under the same condition).
if torch.cuda.is_available():
    print(f"Moving the model to the GPU")
    model_nn = model_nn.to(device)

# Loss function - pick the one matching the task:
#   regression (other losses can replace MSE):  criterion = nn.MSELoss()
#   multi-class classification:                 criterion = nn.CrossEntropyLoss()
# Binary classification is the active choice here:
criterion = nn.BCEWithLogitsLoss()

# Adam optimiser over all model parameters, learning rate 0.01.
optimizer = torch.optim.Adam(params=model_nn.parameters(), lr=0.01)
# Evaluating the model
# NOTE(review): no training loop is visible between the optimizer set-up and
# this evaluation - confirm `model_nn` has been trained before this runs.

# Inference mode on every device (previously only set on the CPU path).
model_nn.eval()

# Permutation feature importance is only computed when CUDA is available;
# the plotting cell is gated on the same condition.
if torch.cuda.is_available():
    # For classification (binary and multiple)
    wrapped_model = TorchModelWrapper(model=model_nn, device=device, task="classification")
    # For regression, only the task changes:
    # wrapped_model = TorchModelWrapper(model=model_nn, device=device, task="regression")

    # Feature importance
    result = permutation_importance(
        wrapped_model,
        X_nn,
        y_nn,
        n_repeats=10,
        random_state=117,
        n_jobs=-1,
    )
    importances = result.importances_mean
    stds = result.importances_std
    indices = np.argsort(importances)[::-1]  # most important feature first

    # Built once, inside the branch that defines its inputs. A second,
    # unconditional construction used to follow the evaluation below and
    # raised NameError when CUDA was unavailable.
    nn_res = pl.DataFrame({
        "Feature": np.array(X.columns)[indices],
        "Importance": importances[indices],
        "Std": stds[indices],
    })

# Binary classification predictions on the test split.
# (Regression: compare `model_nn(X_tst_tensor)` with `criterion` instead.
#  Multi-class: use `torch.argmax(outputs, dim=1)` instead of the threshold.)
with torch.no_grad():
    # Sigmoid squashes the logits into probabilities between 0 and 1.
    probs = torch.sigmoid(model_nn(X_tst_tensor))
    # Threshold at 0.5. Both results are converted to int numpy arrays on
    # every device (`.cpu()` is a no-op for CPU tensors), so the report cell
    # receives the same types with or without a GPU.
    preds = (probs >= 0.5).int().cpu().numpy()
    y_true = y_tst_tensor.int().cpu().numpy()
Show the code
# Classification report for the neural network's test-set predictions.
print(classification_report(y_true, preds))

# The importance chart is drawn only when CUDA is available, which is also
# the only case in which `nn_res` is needed here.
if not torch.cuda.is_available():
    print("CUDA is not available!")
else:
    # Ascending sort + tail(10) keeps the ten largest importances.
    top_feats = nn_res.sort("Importance").tail(10)

    feats_bar = (
        ggplot(data=top_feats)
        + geom_bar(
            mapping=aes(x='Importance', y='Feature', fill='Feature', color='Feature'),
            stat='identity',
        )
        + guides(color="none")
        + labs(
            title="Feature Importance",
            subtitle="The top 10 most important features are shown that helped the model learn on the data.",
            x="Importance",
            y="Feature",
            fill='Feature',
        )
        # Gray surfaces with white text: every listed theme slot gets its own
        # freshly built element, in the same order as before.
        + theme(
            **{
                slot: element_rect(fill='gray')
                for slot in (
                    "panel_background",
                    "plot_background",
                    "panel_grid_major",
                    "legend_background",
                )
            },
            **{
                slot: element_text(color='white')
                for slot in (
                    "axis_text",
                    "axis_title",
                    "plot_title",
                    "plot_subtitle",
                    "legend_text",
                    "legend_title",
                    "label_text",
                )
            },
        )
        + ggsize(1600, 900)
    )
    display(feats_bar)